# Imports
import pandas as pd
import matplotlib.pyplot as plt
import pingouin
from scipy.stats import mannwhitneyu

# Read the men's and women's match results from their CSV files
men = pd.read_csv("men_results.csv")
women = pd.read_csv("women_results.csv")

# Preview the first three rows of each table; these bare expressions only
# render output in an interactive session (e.g. a notebook cell)
men.head(3)

women.head(3)

# Remove the "Unnamed: 0" column from both tables
# (presumably a saved row index carried over from the CSV export -- confirm)
men = men.drop(columns="Unnamed: 0")
women = women.drop(columns="Unnamed: 0")

# Restrict both tables to FIFA World Cup matches dated after the cutoff,
# then label each subset and total the goals scored per match.
cutoff_date = "2002-01-01"
wanted_tournaments = ["FIFA World Cup"]

# Men's matches: parse dates first so the comparison below is chronological
men["date"] = pd.to_datetime(men["date"])
men_after_cutoff = men["date"] > cutoff_date
men_in_world_cup = men["tournament"].isin(wanted_tournaments)
men_subset = men.loc[men_after_cutoff & men_in_world_cup].assign(
    group="men",
    goals_scored=lambda matches: matches["home_score"] + matches["away_score"],
)

# Women's matches: same treatment
women["date"] = pd.to_datetime(women["date"])
women_after_cutoff = women["date"] > cutoff_date
women_in_world_cup = women["tournament"].isin(wanted_tournaments)
women_subset = women.loc[women_after_cutoff & women_in_world_cup].assign(
    group="women",
    goals_scored=lambda matches: matches["home_score"] + matches["away_score"],
)

# Determine normality using histograms.
# Each distribution is drawn on its own explicitly created figure, so the two
# plots can never share axes regardless of the active matplotlib backend.
# The figure is closed (not cleared) once shown: clearing after plt.show() can
# implicitly create a new blank figure in notebook front ends, which then
# appears as an empty "0 Axes" output.
for goals, title in (
    (women_subset["goals_scored"], "Women Goals Scored Distribution"),
    (men_subset["goals_scored"], "Men Goals Scored Distribution"),
):
    fig, ax = plt.subplots()
    goals.hist(ax=ax)
    ax.set_title(title)
    plt.show()
    # Closing a figure that the front end already disposed of is a no-op
    plt.close(fig)

<Figure size 640x480 with 0 Axes>

# Stack the women's and men's World Cup matches into one table with a fresh index
both = pd.concat([women_subset, men_subset], ignore_index=True)

# Reshape to wide form for pingouin: one goals column per group, so every row
# holds a value in its own group's column and NaN in the other
both_subset = both.loc[:, ["goals_scored", "group"]]
both_subset_wide = both_subset.pivot(columns="group", values="goals_scored")

# Right-tailed Wilcoxon-Mann-Whitney test with pingouin
# (alternative="greater": women's goals tend to be larger than men's)
# NOTE(review): relies on pingouin ignoring the NaN padding in each column -- confirm
women_goals_wide = both_subset_wide["women"]
men_goals_wide = both_subset_wide["men"]
results_pg = pingouin.mwu(x=women_goals_wide, y=men_goals_wide, alternative="greater")

# Same test via SciPy, fed directly from the per-group subsets
results_scipy = mannwhitneyu(
    women_subset["goals_scored"],
    men_subset["goals_scored"],
    alternative="greater",
)

# Take the p-value from the first row of the pingouin result table
p_val = results_pg["p-val"].iloc[0]

# Reject the null hypothesis when the p-value is at or below the 0.01 threshold
result = "reject" if p_val <= 0.01 else "fail to reject"

# Bundle the p-value and the decision together
result_dict = dict(p_val=p_val, result=result)

# Bare expression: displays the dictionary in an interactive session
result_dict

{'p_val': 0.005106609825443641, 'result': 'reject'}

	Unnamed: 0	date	home_team	away_team	home_score	away_score	tournament
0	0	1872-11-30	Scotland	England	0	0	Friendly
1	1	1873-03-08	England	Scotland	4	2	Friendly
2	2	1874-03-07	Scotland	England	2	1	Friendly

	Unnamed: 0	date	home_team	away_team	home_score	away_score	tournament
0	0	1969-11-01	Italy	France	1	0	Euro
1	1	1969-11-01	Denmark	England	4	3	Euro
2	2	1969-11-02	England	France	2	0	Euro

Project Description

Let's get started!